import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
from sklearn.cluster import KMeans
from sklearn.preprocessing import scale
from sklearn.preprocessing import MinMaxScaler
from sklearn import svm
from sklearn.model_selection import train_test_split
"ggplot") plt.style.use(
Linear regression is the most common supervised model: you use \(X\) to supervise the prediction of \(Y\). In contrast, an unsupervised model does not rely on a target \(Y\) at all; it learns structure from \(X\) alone, as in clustering.
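To make the distinction concrete, here is a minimal sketch (my own toy arrays, not part of the examples below, reusing the imports at the top of the page): a supervised estimator is fit on pairs \((X, Y)\), whereas an unsupervised one such as KMeans only ever sees \(X\).
from sklearn.linear_model import LinearRegression

X_toy = np.random.rand(100, 1)      # features
y_toy = 1 + 2 * X_toy[:, 0]         # target, used only by the supervised model

supervised = LinearRegression().fit(X_toy, y_toy)           # learns the mapping X -> Y
unsupervised = KMeans(n_clusters=2, n_init=10).fit(X_toy)   # finds structure in X alone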
Overfitting
How do you overfit a model? Just use more parameters than necessary. Here is an example.
Too many parameters make the model fit the training noise and render it useless for predicting new data.
np.random.seed(12)

def overfitting_example(u=np.random.normal(0, 1, 100), poly_order=4):
    X = np.linspace(0, 8, 100)
    Y = 3 * X + u

    X_train, Y_train = X[:80], Y[:80]
    X_test, Y_test = X[80:], Y[80:]

    p4 = np.poly1d(np.polyfit(X_train, Y_train, poly_order))
    X_ax = np.linspace(0, 8)
    R_sqr_train = r2_score(Y_train, p4(X_train))
    R_sqr_test = r2_score(Y_test, p4(X_test))

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.scatter(X_train, Y_train, label="Train")
    ax.scatter(X_test, Y_test, label="Test")
    ax.plot(X_ax, p4(X_ax))
    ax.set_title(
        "Overfitting Example, Training $R^2:{:.2f}$, Test $R^2:{:.2f}$".format(
            R_sqr_train, R_sqr_test
        )
    )
    ax.legend()
    plt.show()

if __name__ == "__main__":
    overfitting_example(u=np.random.normal(0, 3, 100), poly_order=4)
Classification
K-Means Clustering
We will use artificial data to demonstrate the idea of K-means clustering: the observations are generated around a handful of random centroids (four clusters in the example below).
def gen_cluster_data(N, k):
    # N observations, k clusters
    points_per_cluster = N / k
    X = []
    for i in range(k):
        centroid_alpha = np.random.uniform(30, 50)
        centroid_beta = np.random.uniform(200, 250)
        for j in range(int(points_per_cluster)):
            X.append(
                [
                    np.random.normal(centroid_alpha, 50),
                    np.random.normal(centroid_beta, 40),
                ]
            )
    X = np.array(X)
    return X

N, k = 2000, 4
data_cluster = gen_cluster_data(N, k)
model = KMeans(n_clusters=k)
results = model.fit(
    scale(data_cluster)  # normalize the data for unitless interpretation
)
What we really need is stored in the labels_ attribute; it also gives us the colors for the plots.
results.labels_
array([0, 0, 0, ..., 1, 3, 0], dtype=int32)
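For instance, a small sketch of how the labels can be used (reusing data_cluster and results from above): pass them straight to the c argument of a scatter plot so each cluster gets its own color.
fig, ax = plt.subplots(figsize=(12, 6))
ax.scatter(data_cluster[:, 0], data_cluster[:, 1], c=results.labels_)  # one color per cluster label
plt.show()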
Entropy
Entropy measures the degree of disorder in a data set: if the data contain many different classes, the entropy is high; if one class dominates, it is low.
The entropy is calculated by \[ H(S)=-p_1 \ln p_1-\cdots-p_n \ln p_n \] where \(p_i\) represents the proportion of each class.
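As a quick numerical illustration (my own helper, not part of the original code), the formula translates directly into a few lines of NumPy:
def entropy(labels):
    # H(S) = -sum_i p_i * ln(p_i), where p_i is the proportion of class i
    _, counts = np.unique(labels, return_counts=True)
    p = counts / counts.sum()
    return -np.sum(p * np.log(p))

print(entropy([0, 0, 0, 0]))  # 0.0: a single class, perfectly ordered
print(entropy([0, 1, 2, 3]))  # ln(4) ≈ 1.386: four equally likely classes, maximal disorder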
Support Vector Machine
SVM classifies data that may only be separable in a higher-dimensional space; mathematically, the support vectors are the training points that define the separating hyperplane.
def gen_cluster_data(N, k):
    # N observations, k clusters
    points_per_cluster = N / k
    X, y = [], []
    for i in range(k):
        centroid_alpha = np.random.uniform(30, 80)
        centroid_beta = np.random.uniform(200, 250)
        for j in range(int(points_per_cluster)):
            X.append(
                [
                    np.random.normal(centroid_alpha, 3),
                    np.random.normal(centroid_beta, 3),
                ]
            )
            y.append(i)
    X = np.array(X)
    y = np.array(y)
    return X, y

X, y = gen_cluster_data(1000, 10)
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(14, 6))
ax[0].scatter(X[:, 0], X[:, 1], c=y)

results_scaled = MinMaxScaler(feature_range=(-1, 1)).fit(X)
X = results_scaled.transform(X)
ax[1].scatter(X[:, 0], X[:, 1], c=y)
plt.show()

svc = svm.SVC(kernel="linear").fit(X, y)
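The fitted SVC exposes the support vectors directly; a small check (my addition) shows how many points actually pin down the hyperplanes:
print(svc.support_vectors_.shape)  # (n_support_vectors, 2): the boundary-defining points
print(svc.n_support_)              # number of support vectors per class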
def plotPrediction(classifier):
    xx, yy = np.meshgrid(np.arange(-1, 1, 0.001), np.arange(-1, 1, 0.001))
    npx, npy = xx.ravel(), yy.ravel()
    sample_points = np.c_[npx, npy]  # concatenate into (n, 2) sample points
    Z = classifier.predict(sample_points)

    fig, ax = plt.subplots(figsize=(12, 6))
    Z = Z.reshape(xx.shape)
    ax.contourf(xx, yy, Z, alpha=0.3)
    ax.scatter(X[:, 0], X[:, 1], c=y)
    plt.show()

if __name__ == "__main__":
    plotPrediction(svc)
Principal Component Analysis
PCA is a dimensionality reduction technique that extracts the most important information in the data while preserving as much of its variability as possible.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Generate some random data
data = np.random.randn(200, 5)

# Perform PCA on the data
pca = PCA()
pca.fit(data)

# Get the explained variance ratio of each principal component
explained_variance_ratio = pca.explained_variance_ratio_

# Get the principal components of the data
principal_components = pca.transform(data)

# Plot the explained variance ratio of each principal component
plt.plot(explained_variance_ratio)
plt.xlabel("Principal component")
plt.ylabel("Explained variance ratio")
plt.show()

# Plot the first two principal components
plt.scatter(principal_components[:, 0], principal_components[:, 1])
plt.xlabel("Principal component 1")
plt.ylabel("Principal component 2")
plt.show()
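Because the data above are independent standard normals, the explained variance ratios all come out near 1/5. A quick check with artificially correlated data (my own sketch) shows the first component absorbing almost all of the variability:
# Two correlated features: the second is mostly a copy of the first
corr_data = np.random.randn(500, 2)
corr_data[:, 1] = 0.9 * corr_data[:, 0] + 0.1 * np.random.randn(500)

pca_corr = PCA().fit(corr_data)
print(pca_corr.explained_variance_ratio_)  # roughly [0.99, 0.01]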
Ridge and Lasso Regression
Ridge regression seeks to minimize \[ \sum_{i=1}^n\left(y_i-\beta_0-\sum_{j=1}^p \beta_j x_{i j}\right)^2+\lambda \sum_{j=1}^p \beta_j^2=\mathrm{RSS}+\lambda \sum_{j=1}^p \beta_j^2 \] where \(\lambda\) is a tuning parameter. Note that the shrinkage penalty does not include the constant term \(\beta_0\).
The motivation for using ridge over OLS is the bias-variance trade-off: ridge regression accepts a little bias in exchange for lower variance, addressing overfitting by adding a regularization term to the objective function.
Lasso regression seeks to minimize \[ \sum_{i=1}^n\left(y_i-\beta_0-\sum_{j=1}^p \beta_j x_{i j}\right)^2+\lambda \sum_{j=1}^p\left|\beta_j\right|=\operatorname{RSS}+\lambda \sum_{j=1}^p\left|\beta_j\right| . \]
I initially suspected Lasso regression had a bug in scikit-learn: plotted next to the other fits, the lasso prediction shows up as a horizontal line. This is in fact expected behaviour rather than a bug: with a single predictor and a relatively large \(\alpha\), the \(\ell_1\) penalty shrinks the slope coefficient all the way to zero (see the coefficient check after the code below).
from sklearn.linear_model import LinearRegression, Ridge, Lasso

X = np.random.rand(100, 1)
y = 2 + 3 * X + np.random.rand(100, 1)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# ols_model = LinearRegression()
ridge_model = Ridge(alpha=1.5)
lasso_model = Lasso(alpha=0.5)

# ols_model.fit(X_train, y_train)
ridge_model.fit(X_train, y_train)
lasso_model.fit(X_train, y_train)

# Make predictions on the test set
# ols_predictions = ols_model.predict(X_test)
ridge_predictions = ridge_model.predict(X_test)
lasso_predictions = lasso_model.predict(X_test)

# Evaluate the models' performance
# ols_r2 = ols_model.score(X_test, y_test)
ridge_r2 = ridge_model.score(X_test, y_test)
lasso_r2 = lasso_model.score(X_test, y_test)

# Plot the results
fig, ax = plt.subplots(figsize=(12, 6))
ax.scatter(X, y)
# ax.plot(X, ols_model.predict(X), label='OLS, $R^2 = {:.3f}$'.format(ols_r2))
ax.plot(X, ridge_model.predict(X), label="Ridge, $R^2 = {:.3f}$".format(ridge_r2))
ax.plot(X, lasso_model.predict(X), label="Lasso, $R^2 = {:.3f}$".format(lasso_r2))
ax.legend()
plt.show()
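To confirm the horizontal line comes from shrinkage rather than a bug, compare the fitted slopes (a quick check I added; coef_ is available on both fitted estimators):
print("Ridge slope:", ridge_model.coef_)  # shrunk toward zero, but still nonzero
print("Lasso slope:", lasso_model.coef_)  # exactly 0.0 here: the sample covariance (~0.25) is below alpha=0.5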
Elastic-Net Regression
If you combine the ridge and lasso penalties, you obtain \[ \sum_{i=1}^n\left(y_i-\beta_0-\sum_{j=1}^p \beta_j x_{i j}\right)^2+\lambda_1 \sum_{j=1}^p \beta_j^2 + \lambda_2 \sum_{j=1}^p\left|\beta_j\right| =\mathrm{RSS}+\lambda_1 \sum_{j=1}^p \beta_j^2 + \lambda_2 \sum_{j=1}^p\left|\beta_j\right| \]
from sklearn.linear_model import ElasticNet

X = np.random.rand(100, 1)
y = 2 + 3 * X + np.random.rand(100, 1)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

ols_model = LinearRegression().fit(X_train, y_train)
ridge_model = Ridge(alpha=1.5).fit(X_train, y_train)
lasso_model = Lasso(alpha=0.5).fit(X_train, y_train)
enet_model = ElasticNet(alpha=0.1, l1_ratio=0.6).fit(X_train, y_train)

# Make predictions on the test set
ols_predictions = ols_model.predict(X_test)
ridge_predictions = ridge_model.predict(X_test)
lasso_predictions = lasso_model.predict(X_test)
enet_predictions = enet_model.predict(X_test)

# Evaluate the models' performance
ols_r2 = ols_model.score(X_test, y_test)
ridge_r2 = ridge_model.score(X_test, y_test)
lasso_r2 = lasso_model.score(X_test, y_test)
enet_r2 = enet_model.score(X_test, y_test)

# Plot the results
fig, ax = plt.subplots(figsize=(12, 6))
ax.scatter(X, y)
ax.plot(X, ols_model.predict(X), label="OLS, $R^2 = {:.3f}$".format(ols_r2))
ax.plot(X, ridge_model.predict(X), label="Ridge, $R^2 = {:.3f}$".format(ridge_r2))
ax.plot(X, lasso_model.predict(X), label="Lasso, $R^2 = {:.3f}$".format(lasso_r2))
ax.plot(X, enet_model.predict(X), label="Elastic-Net, $R^2 = {:.3f}$".format(enet_r2))
ax.legend()
plt.show()
K-Nearest Neighbors
from sklearn.datasets import make_classification
from sklearn.neighbors import KNeighborsClassifier

# Generate some random data
X, y = make_classification(
    n_samples=200,
    n_features=2,
    n_redundant=0,
    n_informative=2,
    n_clusters_per_class=1,
    n_classes=4,
    random_state=1,
)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Create a k-NN classifier
knn = KNeighborsClassifier(n_neighbors=6)

# Train the classifier on the training data
knn.fit(X_train, y_train)

# Test the classifier on the test data
accuracy = knn.score(X_test, y_test)
print(f"Test accuracy: {accuracy:.2f}")

# Plot the data
fig, ax = plt.subplots(figsize=(14, 7))
ax.scatter(X[:, 0], X[:, 1], c=y)

# Define a grid of points to classify
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1), np.arange(y_min, y_max, 0.1))

# Classify the points on the grid
Z = knn.predict(np.c_[xx.ravel(), yy.ravel()])

# Plot the decision boundary
Z = Z.reshape(xx.shape)
ax.contour(xx, yy, Z, cmap="RdBu", alpha=0.5)
plt.show()
Test accuracy: 0.72
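The choice n_neighbors=6 above is arbitrary; a simple way to sanity-check it (my own sketch, reusing the train/test split from above) is to sweep a few values of k and compare test accuracy:
for k in (1, 3, 6, 11, 21):
    score = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train).score(X_test, y_test)
    print(f"k = {k:2d}, test accuracy = {score:.2f}")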